# Load every package used by the analysis. pacman::p_load() installs any
# package that is missing before attaching it, so only pacman itself has to be
# bootstrapped by hand. requireNamespace() is used for that check because it
# only tests availability; p_load() is always called through `pacman::`, so
# pacman does not need to be attached.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(
  arrow,
  readr,
  tidyverse,
  lubridate,
  readxl,
  rvest,
  tidyr,
  plotly,
  scales,
  sf,
  janitor,
  stringr,
  patchwork,
  purrr
)

# Download complaint data
# NYPD complaint records: one CSV snapshot kept with its original column
# names, and one Parquet snapshot passed through clean_names().
complaint_data <- read_csv("data/NYPD_Complaint_Data_Historic_20251019.csv")
complaint_data_full <- clean_names(
  read_parquet("data/NYPD_Complaint_Data_Historic_20251021.parquet")
)

# Eviction records: again one snapshot with the original column names and one
# passed through clean_names().
evictions <- read_csv_arrow("data/Evictions_20251019.csv")
df_full_evictions <- clean_names(
  read_csv_arrow("data/Evictions_20251021.csv")
)

# Parse ukrainian crime rates data:
# Fetch the Ukrainian-language Wikipedia article and collect every table that
# carries the `wikitable` class.
url <- "https://uk.wikipedia.org/wiki/Злочинність_в_Україні"
page <- read_html(url)
tabs <- page |>
  html_elements("table.wikitable")

# Fail with a readable message instead of a bare "subscript out of bounds"
# if the page no longer contains any matching table.
if (length(tabs) == 0) {
  stop("No `table.wikitable` elements found at ", url, call. = FALSE)
}

# The first table is the one used downstream. `fill = TRUE` is no longer
# passed: the argument is deprecated since rvest 1.0.0, where missing cells
# are always filled with NA.
crime_ukr <- tabs[[1]] |>
  html_table()

# First group complaint dataset by year and find out number of complaints each year:
# Parse CMPLNT_FR_DT (month/day/year) into `date`, derive `year` from it and
# keep rows from 1900 onwards; rows whose year is NA are dropped by the filter.
# BORO_NM is exposed under the name NYC_district.
complaint_summary <- complaint_data |>
  rename(NYC_district = BORO_NM) |>
  mutate(date = mdy(CMPLNT_FR_DT)) |>
  mutate(year = year(date)) |>
  filter(year >= 1900)
# Number of eviction records per execution year and (upper-cased) borough,
# sorted by year and then borough.
by_year_borough <- evictions |>
  mutate(exec_date = mdy(`Executed Date`)) |>
  mutate(
    year = year(exec_date),
    borough = toupper(BOROUGH)
  ) |>
  group_by(year, borough) |>
  summarise(n = n(), .groups = "drop") |>
  arrange(year, borough) |>
  collect()
# Clean and process ukrainian crime rates data (add extra 2 missing rows):
# keep years from 2010 onwards and turn the "всього злочинів" column into an
# integer by stripping every non-digit character; cells left empty after the
# stripping become NA.
crime_ukr <- crime_ukr |>
  filter(!is.na(рік)) |>
  filter(рік >= 2010) |>
  mutate(
    total_crimes = `всього злочинів` |>
      str_replace_all("\\D", "") |>
      na_if("") |>
      as.integer()
  ) |>
  select(рік, total_crimes)
crime_ukr <- crime_ukr |>
  add_row(рік = 2024, total_crimes = 382335)
# Append the 2025 figure by hand.
# NOTE(review): if the scraped table ever gains its own 2025 row this will
# create a duplicate year — confirm against the source table.
crime_ukr <- add_row(crime_ukr, рік = 2025, total_crimes = 473662)

# Line chart of yearly crime totals (in thousands) with one x-axis tick per
# year present in the data. Line thickness is set through `linewidth`; using
# `size` for lines is deprecated since ggplot2 3.4.0 and raised a warning.
ggplot(crime_ukr, aes(x = рік, y = total_crimes / 1000)) +
  geom_line(color = "#00CDCD", linewidth = 1.2) +
  geom_point(color = "#FFB90F", size = 2.5) +
  scale_x_continuous(breaks = crime_ukr$рік) +
  labs(
    title = "Total number of crimes committed each year in Ukraine",
    x = NULL,
    y = "Total crimes (in thousands)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    panel.grid.minor.x = element_blank()
  )

# Yearly complaint totals restricted to 2000-2025 (NA years are dropped).
crime_year <- complaint_summary |>
  filter(between(year, 2000, 2025)) |>
  count(year, name = "complaints") |>
  arrange(year)

# Static line chart. The `text` aesthetic is not drawn by ggplot2; it only
# carries the label that plotly shows on hover.
complaint_vis <- ggplot(
  crime_year,
  aes(
    x = year,
    y = complaints,
    group = 1,
    text = paste0("year: ", year, "\ncomplaints: ", comma(complaints))
  )
) +
  geom_line(color = "#2F539B", linewidth = 1) +
  scale_x_continuous(breaks = pretty_breaks(n = 5)) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "NYC crime complaints by year",
    x = "year",
    y = "number of complaints"
  ) +
  theme_minimal() +
  theme(
    text = element_text(face = "italic"),
    plot.background = element_rect(fill = "#F5F5F5"),
    panel.background = element_rect(fill = "#F5F5F5"),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "#728FCE"),
    panel.grid.minor = element_blank()
  )

# Interactive version: one unified hover label per x position and
# comma-separated integer ticks on the y axis.
complaint_interactive <- layout(
  ggplotly(complaint_vis, tooltip = "text"),
  hovermode = "x unified",
  yaxis = list(tickformat = ",d")
)
# Render the interactive complaints chart built above.
complaint_interactive

# Citywide monthly evictions vs. felonies.
# The analysis window is defined once, as Date objects, so that the filters
# and the plot title cannot drift apart.
period_start <- as_date("2016-01-01")
period_end <- as_date("2025-12-31")

# Evictions per month: rows without a borough, or labelled "Citywide", are
# excluded before counting.
eviction_monthly <- df_full_evictions |>
  select(borough, executed_date) |>
  mutate(executed_date = as_date(executed_date, format = "%m/%d/%Y")) |>
  dplyr::filter(
    executed_date >= period_start,
    executed_date <= period_end,
    !is.na(borough),
    borough != "Citywide"
  ) |>
  mutate(month_year = floor_date(executed_date, "month")) |>
  group_by(month_year) |>
  summarise(total_count = n()) |>
  collect() |>
  mutate(type = "Evictions")

# Felony complaints per month: rows without a borough are excluded.
crime_monthly <- complaint_data_full |>
  select(boro_nm, cmplnt_fr_dt, law_cat_cd) |>
  mutate(cmplnt_fr_dt = as_date(cmplnt_fr_dt, format = "%m/%d/%Y")) |>
  dplyr::filter(
    cmplnt_fr_dt >= period_start,
    cmplnt_fr_dt <= period_end,
    !is.na(boro_nm),
    law_cat_cd == "FELONY"
  ) |>
  mutate(month_year = floor_date(cmplnt_fr_dt, "month")) |>
  group_by(month_year) |>
  summarise(total_count = n()) |>
  collect() |>
  mutate(type = "Felonies")

# One panel per series with independent y axes, because the two series are
# counted separately and share only the time axis.
bind_rows(eviction_monthly, crime_monthly) |>
  ggplot(aes(x = month_year, y = total_count, color = type)) +
  geom_line(linewidth = 1) +
  facet_wrap(~ type, ncol = 1, scales = "free_y") +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  scale_color_manual(
    values = c("Evictions" = "#1f78b4", "Felonies" = "#e31a1c")
  ) +
  labs(
    title = paste0(
      "Total Monthly Evictions vs. Felonies in NYC (",
      year(period_start), "-", year(period_end), ")"
    ),
    x = "Date",
    y = "Total Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
# Monthly evictions vs. felonies for a single borough.
target_borough <- "Bronx"

# The analysis window is defined once, as Date objects, and reused by both
# filters and by the plot title (the title previously claimed 2019-2023 while
# the data covered 2016-2025).
period_start <- as_date("2016-01-01")
period_end <- as_date("2025-12-31")

# Evictions per month in the target borough. Borough names are converted to
# title case so they can be compared with `target_borough`.
eviction_monthly <- df_full_evictions |>
  select(borough, executed_date) |>
  mutate(executed_date = as_date(executed_date, format = "%m/%d/%Y")) |>
  dplyr::filter(
    executed_date >= period_start,
    executed_date <= period_end,
    !is.na(borough),
    borough != "Citywide"
  ) |>
  mutate(borough = str_to_title(borough)) |>
  dplyr::filter(borough == target_borough) |>
  mutate(month_year = floor_date(executed_date, "month")) |>
  group_by(month_year) |>
  summarise(total_count = n()) |>
  collect() |>
  mutate(type = "Evictions")

# Felony complaints per month in the target borough.
crime_monthly <- complaint_data_full |>
  select(boro_nm, cmplnt_fr_dt, law_cat_cd) |>
  mutate(cmplnt_fr_dt = as_date(cmplnt_fr_dt, format = "%m/%d/%Y")) |>
  dplyr::filter(
    cmplnt_fr_dt >= period_start,
    cmplnt_fr_dt <= period_end,
    !is.na(boro_nm),
    law_cat_cd == "FELONY"
  ) |>
  mutate(borough = str_to_title(boro_nm)) |>
  dplyr::filter(borough == target_borough) |>
  mutate(month_year = floor_date(cmplnt_fr_dt, "month")) |>
  group_by(month_year) |>
  summarise(total_count = n()) |>
  collect() |>
  mutate(type = "Felonies")

# One panel per series with independent y axes.
bind_rows(eviction_monthly, crime_monthly) |>
  ggplot(aes(x = month_year, y = total_count, color = type)) +
  geom_line(linewidth = 1) +
  facet_wrap(~ type, ncol = 1, scales = "free_y") +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  scale_color_manual(
    values = c("Evictions" = "#1f78b4", "Felonies" = "#e31a1c")
  ) +
  labs(
    title = paste0(
      "Monthly Evictions vs. Felonies in ", target_borough,
      " (", year(period_start), "-", year(period_end), ")"
    ),
    x = "Date",
    y = "Total Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
# Point locations of evictions executed in 2019. Coordinates are coerced to
# numeric, and rows with missing coordinates or coordinates outside the
# latitude/longitude box below are discarded.
eviction_points <- df_full_evictions |>
  select(borough, latitude, longitude, executed_date) |>
  mutate(
    executed_date = as_date(executed_date, format = "%m/%d/%Y"),
    latitude = as.numeric(latitude),
    longitude = as.numeric(longitude)
  ) |>
  dplyr::filter(
    year(executed_date) == 2019,
    !is.na(latitude), !is.na(longitude)
  ) |>
  dplyr::filter(
    latitude > 40.4 & latitude < 40.9,
    longitude > -74.3 & longitude < -73.6
  ) |>
  collect()

# Point locations of felony complaints dated 2019, cleaned the same way.
crime_points <- complaint_data_full |>
  select(c(boro_nm, latitude, longitude, cmplnt_fr_dt, law_cat_cd)) |>
  mutate(
    cmplnt_fr_dt = as_date(cmplnt_fr_dt, format = "%m/%d/%Y"),
    latitude = as.numeric(latitude),
    longitude = as.numeric(longitude)
  ) |>
  dplyr::filter(
    year(cmplnt_fr_dt) == 2019,
    law_cat_cd == "FELONY",
    !is.na(latitude), !is.na(longitude)
  ) |>
  dplyr::filter(
    latitude > 40.4 & latitude < 40.9,
    longitude > -74.3 & longitude < -73.6
  ) |>
  collect()

# Filled 2D density contours for a set of points.
#   points: data frame with numeric `longitude` and `latitude` columns.
#   title:  plot title.
# Returns a ggplot object. The contour level is referenced with
# after_stat(level); the `..level..` notation is deprecated since
# ggplot2 3.4.0.
plot_hotspots <- function(points, title) {
  ggplot() +
    stat_density_2d(
      data = points,
      aes(x = longitude, y = latitude, fill = after_stat(level)),
      geom = "polygon"
    ) +
    scale_fill_viridis_c(option = "magma") +
    theme_minimal() +
    theme(legend.position = "none") +
    labs(title = title)
}

plot_evictions <- plot_hotspots(eviction_points, "Eviction Hotspots in 2019")
plot_felonies <- plot_hotspots(crime_points, "Felony Hotspots in 2019")

# patchwork places the two maps side by side.
plot_evictions + plot_felonies
# Complaints per year and borough for 2000-2025. complete() adds every
# year/borough combination that has no rows, with a count of 0.
crime_by_year_borough <- complaint_data_full |>
  mutate(
    date = mdy(cmplnt_fr_dt),
    year = year(date),
    borough = toupper(boro_nm)
  ) |>
  filter(!is.na(year), year >= 2000, year <= 2025) |>
  group_by(year, borough) |>
  summarise(complaints = n(), .groups = "drop") |>
  mutate(year = as.integer(year)) |>
  complete(
    year = 2000:2025,
    borough = c("MANHATTAN", "BROOKLYN", "QUEENS", "BRONX", "STATEN ISLAND"),
    fill = list(complaints = 0)
  )

# Evictions per execution year and borough.
ev_by_year_boro <- df_full_evictions |>
  mutate(
    exec_date = mdy(executed_date),
    year = year(exec_date),
    borough = toupper(borough)
  ) |>
  count(year, borough, name = "evict")

# Tertile rank (1-3) of `x` within the current group, or NA for every element
# when the whole group is zero. ntile() ignores ties, so an all-zero group
# (a year with no records at all) would otherwise be split into arbitrary
# 1/2/3 classes that depend only on row order.
tertile_or_na <- function(x) {
  if (all(x == 0)) {
    rep(NA_integer_, length(x))
  } else {
    ntile(x, 3)
  }
}

# Per-year bivariate class for each borough: complaint tertile, eviction
# tertile, and their combined "c-e" key. The key is NA when either tertile is
# undefined, so such borough-years fall back to the fill scale's NA colour.
bi <- crime_by_year_borough |>
  left_join(ev_by_year_boro, by = c("year", "borough")) |>
  mutate(evict = replace_na(evict, 0L)) |>
  group_by(year) |>
  mutate(
    c_q = tertile_or_na(complaints),
    e_q = tertile_or_na(evict),
    bi_key = if_else(
      is.na(c_q) | is.na(e_q),
      NA_character_,
      paste(c_q, e_q, sep = "-")
    )
  ) |>
  ungroup() |>
  collect()
# Borough boundary polygons read from the local shapefile.
nyc_boroughs <- st_read("data/nybb.shp")
# Console output recorded when the document was rendered:
#   Reading layer `nybb' from data source `/home/taras/Documents/Workspace/DS/crime-analysis/data/nybb.shp' using driver `ESRI Shapefile'
#   Simple feature collection with 5 features and 4 fields
#   Geometry type: MULTIPOLYGON
#   Dimension: XY
#   Bounding box: xmin: 913175.1 ymin: 120128.4 xmax: 1067383 ymax: 272844.3
#   Projected CRS: NAD83 / New York Long Island (ftUS)

# Keep only an upper-cased borough name (the join key used for the map) and
# the geometry.
nyc_boroughs <- nyc_boroughs |>
  mutate(borough = toupper(BoroName)) |>
  select(borough, geometry)
# Attach the per-year bivariate classes to the borough polygons: one row per
# borough and year, with a unique row id used as the plotly object id.
map_bi <- nyc_boroughs |>
  left_join(bi, by = "borough") |>
  mutate(id = row_number())

# Fill colours keyed by "<complaint tertile>-<eviction tertile>".
bi_pal <- c(
  "1-1" = "#e8e8e8", "2-1" = "#b8d6be", "3-1" = "#73ae80",
  "1-2" = "#bed8ec", "2-2" = "#90b2b3", "3-2" = "#5a9178",
  "1-3" = "#64acbe", "2-3" = "#4a7c8c", "3-3" = "#3b5d70"
)

# One polygon per borough, drawn once as a static black outline.
nyc_boroughs_static <- map_bi |>
  group_by(borough) |>
  slice(1) |>
  ungroup() |>
  select(borough, geometry)

# `frame`, `ids` and `text` are not ggplot2 aesthetics; they are picked up by
# ggplotly() for the animation frame, object identity and hover label. The
# `text` mapping is required because ggplotly() is asked for tooltip = "text";
# without it the hover labels are empty.
p_bi_static <- ggplot(map_bi) +
  geom_sf(
    data = nyc_boroughs_static,
    fill = NA, color = "black", linewidth = 0.5,
    inherit.aes = FALSE
  ) +
  geom_sf(
    aes(
      fill = bi_key,
      frame = year,
      ids = id,
      text = paste0(
        borough, " (", year, ")",
        "\ncomplaints: ", comma(complaints),
        "\nevictions: ", comma(evict)
      )
    ),
    color = NA, linewidth = 0.3
  ) +
  scale_fill_manual(
    values = bi_pal,
    na.value = "grey80",
    name = "complaints × evictions"
  ) +
  labs(title = "Relation of crime complaints and evictions") +
  theme_minimal() +
  theme(
    plot.title = element_text(
      face = "bold", size = 16, hjust = 0.5,
      margin = margin(t = 5, b = 5)
    ),
    legend.position = "right",
    plot.background = element_rect(fill = "white", color = NA)
  )

p_bi_interactive <- ggplotly(p_bi_static, tooltip = "text")
p_bi_interactive